library(ggplot2)
library(data.table)
library(scales)
library(RColorBrewer)
library(reticulate)
library(uwot)
## Loading required package: Matrix
library(gridExtra)
use_condaenv(condaenv = 'py37', required = TRUE)
data_dir <- here::here('..','..','data')
load(file.path(data_dir, 'diff.sampled.Rdata'))
diff_dt <- fread(file.path(data_dir, 'diff.sampled.csv.gz'))
car_set <- c('41BB','CD28','BAFF-R','CD40','TACI')
skip_umap_param_search = T
censor_negative_min = -1500
# map day 0 to both plus and minus
map_day_0 <- function(df) {
return(rbind(
df[k562!='none'],
df[k562=='none'][, k562 := 'cd19+'],
df[k562=='none'][, k562 := 'cd19-']
))
}
For now, skip straight to high dimensional plotting.
First, find parameters with a small sample.
umap_dt <- diff_dt[!is.na(event_id),
.SD[event_id %in% sample(unique(event_id),
min(20, length(unique(event_id))))],
by=c('well', 'plate', 'day')]
#grab channel map from cd4 data (cd4/8 are the same)
channel_map <- diff_opt_cd4$channel_map
# for now use all variables in the channel map except cd4/8
umap_vars <- c(paste0(names(channel_map),'_t'))
umap_vars <- umap_vars[umap_vars != 'cd_t']
#cast it so variables are columns and
#subset sample umap data on variables
umap_cast_dt <- dcast(umap_dt,
event_id + well + plate + day + t_type ~ variable,
value.var='value')
umap_dt_in <- umap_cast_dt[, ..umap_vars]
# for points that are very negative, trim the to below the cutoff
umap_dt_in[umap_dt_in < censor_negative_min] <- censor_negative_min
# scale each input column
umap_dt_in[ ,
(names(umap_dt_in)) := lapply(.SD, scale), .SDcols = names(umap_dt_in)]
# check scales:
ggplot(melt(cbind(umap_dt_in, id=1:nrow(umap_dt_in)), id.var='id'),
aes(x=value)) +
geom_density() +
facet_grid(variable~.) +
scale_fill_distiller(palette='RdYlBu') +
theme_bw()
#umap parameter list
min_dist_set <- c(0.0001, 0.005, 0.1)
n_neighbor_set <- c(3,6,15,30)
umap_params <- data.table(expand.grid(min_dist_set, n_neighbor_set))
# run umap via uwot library
umap_out <- umap_params[, {
print(paste0('Running: ',.BY));
as.data.table(umap(umap_dt_in, min_dist=as.numeric(.BY[1]),
n_neighbors=as.numeric(.BY[2]), verbose=F, n_threads=8, n_trees=50))
}, by=names(umap_params)]
## [1] "Running: 1e-04" "Running: 3"
## [1] "Running: 0.005" "Running: 3"
## [1] "Running: 0.1" "Running: 3"
## [1] "Running: 1e-04" "Running: 6"
## [1] "Running: 0.005" "Running: 6"
## [1] "Running: 0.1" "Running: 6"
## [1] "Running: 1e-04" "Running: 15"
## [1] "Running: 0.005" "Running: 15"
## [1] "Running: 0.1" "Running: 15"
## [1] "Running: 1e-04" "Running: 30"
## [1] "Running: 0.005" "Running: 30"
## [1] "Running: 0.1" "Running: 30"
# rename the columns
names(umap_out) <- c('min_dist','n_neighbor','umap_1','umap_2')
# add the umap output to the input dt
umap_fcs_dt <- cbind(umap_cast_dt[, 1:length(names(umap_cast_dt))], umap_out)
umap_fcs_dt[, id := 1:.N]
umap_fcs_dt <- unique(umap_dt[,
.(donor, car, k562, t_type, day, well, event_id)])[
umap_fcs_dt, on=.(t_type,well,day,event_id)]
color_points <- ggplot(umap_fcs_dt[car %in% c('CD28','41BB','KLRG1','BAFF-R','Zeta')],
aes(x=umap_1, y=umap_2, color=interaction(car, t_type))) +
geom_point(size=0.1, alpha=0.1) +
guides(colour = guide_legend(override.aes = list(alpha=1, size=3))) +
facet_grid(min_dist~n_neighbor) +
theme_bw()
color_density <- ggplot(umap_fcs_dt, aes(x=umap_1, y=umap_2)) +
geom_hex(bins = 70) +
scale_fill_continuous(
type = "viridis", limits=c(0,30), oob=scales::squish) +
facet_grid(min_dist~n_neighbor) +
theme_bw()
grid.arrange(color_points, color_density, ncol=2)
Choosing neighbors == 15 and min_dist == 1e-4.
Scaling this to 200 events per well and checking CAR/condition separation.
chosen_dist=0.005
chosen_n_neighbor=15
umap_dt <- diff_dt[!is.na(event_id),
.SD[event_id %in% sample(unique(event_id),
min(200, length(unique(event_id))))],
by=c('well', 'plate', 'day')]
#grab channel map from cd4 data (cd4/8 are the same)
channel_map <- diff_opt_cd4$channel_map
# for now use all variables in the channel map
umap_vars <- c(paste0(names(channel_map),'_t'))
#cast it so variables are columns and
#subset sample umap data on variables
umap_cast_dt <- dcast(umap_dt,
event_id + well + plate + day + t_type ~ variable,
value.var='value')
umap_dt_in <- umap_cast_dt[, ..umap_vars]
# scale each input column
umap_dt_in[ ,
(names(umap_dt_in)) := lapply(.SD, scale), .SDcols = names(umap_dt_in)]
# run umap via uwot library
umap_out <- as.data.table(umap(umap_dt_in, min_dist=chosen_dist,
n_neighbors=chosen_n_neighbor, verbose=T, n_threads=8, n_trees=50))
## 05:18:04 UMAP embedding parameters a = 1.914 b = 0.7956
## 05:18:04 Read 139760 rows and found 10 numeric columns
## 05:18:04 Using Annoy for neighbor search, n_neighbors = 15
## 05:18:04 Building Annoy index with metric = euclidean, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 05:18:28 Writing NN index file to temp file /tmp/RtmplnADUt/file1747458f2f86
## 05:18:28 Searching Annoy index using 8 threads, search_k = 1500
## 05:18:37 Annoy recall = 100%
## 05:18:37 Commencing smooth kNN distance calibration using 8 threads
## 05:18:40 Initializing from normalized Laplacian + noise
## 05:18:45 Commencing optimization for 200 epochs, with 2781532 positive edges
## 05:20:50 Optimization finished
# rename the columns
names(umap_out) <- c('umap_1','umap_2')
# add the umap output to the input dt
umap_fcs_dt <- cbind(umap_cast_dt[, 1:length(names(umap_cast_dt))], umap_out)
umap_fcs_dt[, id := 1:.N]
# add back the annotations
umap_fcs_dt <- unique(umap_dt[,
.(donor, car, k562, t_type, day, well, event_id)])[
umap_fcs_dt, on=.(t_type,well,day,event_id)]
cd4_colors = brewer.pal('Greens', n=9)[c(2,4,6,8,9)]
cd8_colors = brewer.pal('Oranges', n=9)[c(2,4,6,8,9)]
color_time <- ggplot(umap_fcs_dt[][, cd19 := (k562=='cd19+')],
aes(x=umap_1, y=umap_2, color=interaction(day, t_type))) +
geom_point(size=0.1, alpha=0.5) +
facet_grid(car~cd19) +
scale_color_manual(values=c(cd4_colors, cd8_colors)) +
guides(colour = guide_legend(override.aes = list(alpha=1, size=3)))
color_density <- ggplot(umap_fcs_dt, aes(x=umap_1, y=umap_2)) +
geom_hex(bins = 70) +
facet_grid(car~cd19) +
scale_fill_continuous(
type = "viridis", limits=c(0,30), oob=scales::squish) +
theme_bw()
color_markers <- ggplot(
melt(umap_fcs_dt, measure.vars=umap_vars)[,
value.scaled := scale(value), by='variable'][,
cd19 := (k562=='cd19+')],
aes(x=umap_1, y=umap_2, z=value.scaled)) +
stat_summary_hex(bins = 70) +
facet_grid(variable~cd19) +
scale_fill_distiller(palette='RdYlBu', limits=c(-3, 3), oob=scales::squish) +
theme_bw()
grid.arrange(color_time, color_density, color_markers, ncol=3)